This document provides a simple analysis of 5.5 hours of the Taiwan Mandarin Spontaneous Speech Corpus.

library(readr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(gridExtra)

Read file

The corpus currently contains 11 speakers in two age groups. The younger group (4M, 4F) is aged 20–35, while the older group (2M, 1F) is aged 50–65. All of them were born and raised in Taipei. Each speaker’s TextGrid files were converted to CSV files to give a better view of the word and syllable frequencies in spontaneous speech. This part reads the 11 files and pre-processes the syllable data and the word data separately for later analysis. To focus only on the Taiwan Mandarin data, words from other languages, such as Southern Min and English, are excluded from the following analysis.

# Read and merge the per-speaker corpus CSV files.
#
# Reads every file in `mypath` whose name ends in "_f.csv", tags each table
# with its file name (`id`) and the speaker's age group and gender, and
# row-binds the results into one tibble.
#
# The speaker metadata is hard-coded and paired with the files by position,
# in the order returned by `list.files()`.
#
# Args:
#   mypath: directory that contains the "*_f.csv" files.
# Returns:
#   A tibble with the rows of all files plus the columns `id`, `age`, `gender`.
# Errors:
#   Stops if the number of matching files differs from the number of
#   hard-coded speakers.
multmerge <- function(mypath) {
  # Escape the dot and anchor the pattern so that only "*_f.csv" files match;
  # full.names lets the files be read when `mypath` is not the working
  # directory.
  filenames <- list.files(mypath, pattern = "_f\\.csv$", full.names = TRUE)
  age <- c("O", "Y", "Y", "Y", "O", "Y", "Y", "Y", "Y", "O", "Y")
  gender <- c("M", "M", "F", "M", "M", "M", "F", "F", "F", "F", "M")

  # Metadata is matched by position, so a different file count would recycle
  # the vectors and attach the wrong age/gender to some speakers.
  if (length(filenames) != length(age)) {
    stop(
      "Expected ", length(age), " '*_f.csv' files in '", mypath,
      "' but found ", length(filenames), ".",
      call. = FALSE
    )
  }

  datalist <- Map(function(path, speaker_age, speaker_gender) {
    f <- readr::read_csv(path, col_types = readr::cols())
    # Drop the unnamed row-index column: readr < 2.0 names it "X1",
    # readr >= 2.0 names it "...1".
    f <- f[setdiff(names(f), c("X1", "...1"))]
    # basename() keeps `id` equal to the bare file name, as before.
    dplyr::mutate(
      f,
      id = basename(path), age = speaker_age, gender = speaker_gender
    )
  }, filenames, age, gender)

  dplyr::bind_rows(datalist)
}

# Load and merge every speaker file found in the current directory.
mycorpus <- multmerge("./")

# Character-tier entries that contain Latin letters; these are the values
# excluded below as non-Chinese material.
nonchi_syll <- mycorpus$syll_c[str_detect(mycorpus$syll_c, "[a-zA-Z]")]
nonchi_word <- mycorpus$word_c[str_detect(mycorpus$word_c, "[a-zA-Z]")]

# Syllable table: rows that have a romanized syllable and whose character
# form is not in the excluded set.
syll <- mycorpus %>%
  filter(!is.na(syll_e), !(syll_c %in% nonchi_syll)) %>%
  select(syll_e, syll_c, id, age, gender)

# Word table: same treatment on the word columns.
word <- mycorpus %>%
  filter(!is.na(word_e), !(word_c %in% nonchi_word)) %>%
  select(word_e, word_c, id, age, gender)

Syllable frequency

This part presents frequencies at the syllable level. There are 82178 syllable tokens in total.

1. Syllables with tones

After filtering out the non-Chinese syllables, this figure presents the 20 most frequent syllables with tones, out of 1007 types, in Taiwan Mandarin spontaneous speech.

# Token count of every toned syllable, most frequent first.
syll_e <- syll %>%
  count(syll_e) %>%
  arrange(desc(n))

# Fix the factor levels in the current row order, keep the 20 most frequent
# syllables (ties included), then reorder the levels by frequency so the bars
# are drawn in sorted order.
syll_e1 <- syll_e %>%
  rename(s = syll_e, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f0 <- ggplot(syll_e1, aes(x = s, y = freq)) +
  geom_col(fill = "#96ffa4", alpha = .6, width = .4) +
  geom_text(aes(label = freq), alpha = .7, size = 3.5, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3800)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f0

#ggsave("f0.png", dpi=600)
2. Syllables without tones

This figure presents the top 20 used syllables without tones out of 404 types in Taiwan Mandarin spontaneous speech.

# Strip the tone digits from every romanized syllable.
notone <- str_remove_all(syll$syll_e, "[0-9]")

df <- tibble::tibble(syll = notone)

# Token count of every toneless syllable.
syll_notone <- df %>%
  filter(!is.na(syll)) %>%
  count(syll)

# Keep the 20 most frequent syllables (ties included) and order the factor
# levels by frequency for plotting.
syll_notone1 <- syll_notone %>%
  rename(s = syll, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f1 <- ggplot(syll_notone1, aes(x = s, y = freq)) +
  geom_col(fill = "#ffe100", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 4800)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f1

#ggsave("f1.png", dpi=600)
3. Tones in syllable level

This figure presents the tone frequencies at the syllable level in Taiwan Mandarin spontaneous speech.

# Reduce each romanized syllable to its tone digits: remove the letters, drop
# entries that contain blanks, then remove punctuation.
syll_tone <- str_remove_all(syll$syll_e, "[:alpha:]")
syll_tone <- syll_tone[!str_detect(syll_tone, "[:blank:]")]
# No blanks can remain after the subsetting above; kept for parity.
syll_tone <- str_remove_all(syll_tone, "[:blank:]")
syll_tone <- str_remove_all(syll_tone, "[:punct:]")

df_tone <- tibble::tibble(syll_tone = syll_tone)

# Count the non-empty tone values, keep the five most frequent (ties
# included) and order the factor levels by frequency for plotting.
df_tone1 <- df_tone %>%
  filter(!is.na(syll_tone), syll_tone != "") %>%
  count(syll_tone) %>%
  rename(s = syll_tone, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(5, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f2 <- ggplot(df_tone1, aes(x = s, y = freq)) +
  geom_col(fill = "#f68060", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 31000)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f2

#ggsave("f2.png", dpi=600)
4. Characters

This figure presents the 20 most frequent syllables, written in characters, out of 1911 types in Taiwan Mandarin spontaneous speech.

# Token count of every syllable in its character form.
syll_c <- syll %>%
  select(s = syll_c) %>%
  count(s)

# Keep the 20 most frequent characters (ties included) and order the factor
# levels by frequency for plotting.
syll_c1 <- syll_c %>%
  rename(freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f3 <- ggplot(syll_c1, aes(x = s, y = freq)) +
  geom_col(fill = "#73d3ff", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3500)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f3

#ggsave("f3.png", dpi=600)

Word frequency

This part presents frequencies at the word level. There are 56950 tokens in total.

1. Chinese words

This figure presents the top 20 used words out of 4934 types in Taiwan Mandarin spontaneous speech.

# Token count of every word in its character form.
word_c <- word %>%
  count(word_c)

# Keep the 20 most frequent words (ties included) and order the factor levels
# by frequency for plotting.
word_c1 <- word_c %>%
  rename(s = word_c, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f4 <- ggplot(word_c1, aes(x = s, y = freq)) +
  geom_col(fill = "#c2adf0", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3000)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f4

#ggsave("f4.png", dpi=600)
2. Romanized words

This figure presents the top 20 used romanized words out of 4899 types in Taiwan Mandarin spontaneous speech.

# Token count of every romanized (toned) word.
word_e <- word %>%
  count(word_e)

# Keep the 20 most frequent words (ties included) and order the factor levels
# by frequency for plotting.
word_e1 <- word_e %>%
  rename(s = word_e, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f5 <- ggplot(word_e1, aes(x = s, y = freq)) +
  geom_col(fill = "#f7bce6", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3000)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f5

#ggsave("f5.png", dpi=600)
3. Romanized words without tone

This figure presents the top 20 used romanized words without tones out of 4272 types in Taiwan Mandarin spontaneous speech.

# Strip the tone digits from every romanized word.
word_notone <- str_remove_all(word$word_e, "[0-9]")

df <- tibble::tibble(word_e = word_notone)

# Token count of every toneless romanized word.
df1 <- df %>%
  count(word_e)

# Keep the 20 most frequent words (ties included) and order the factor levels
# by frequency for plotting.
df2 <- df1 %>%
  rename(s = word_e, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f6 <- ggplot(df2, aes(x = s, y = freq)) +
  geom_col(fill = "#5de8d8", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3000)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f6

#ggsave("f6.png", dpi=600)
4. Tones in word level

This figure presents the 15 most frequent tone combinations, out of 222 types, at the word level in Taiwan Mandarin spontaneous speech.

# Reduce each romanized word to its sequence of tone digits by removing
# letters, blanks and punctuation.
tone <- word$word_e %>%
  str_remove_all("[:alpha:]") %>%
  str_remove_all("[:blank:]") %>%
  str_remove_all("[:punct:]")

df <- tibble::tibble(tone = tone)

# Token count of every non-empty tone combination.
tone_c <- df %>%
  filter(tone != "") %>%
  count(tone)

# Keep the 15 most frequent combinations (ties included) and order the factor
# levels by frequency for plotting.
tone_c1 <- tone_c %>%
  rename(s = tone, freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(15, freq) %>%
  mutate(s = fct_reorder(s, freq))

# Horizontal bar chart; every bar is labelled with its count.
f7 <- ggplot(tone_c1, aes(x = s, y = freq)) +
  geom_col(fill = "#577bde", alpha = .6, width = .4) +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 13000)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

f7

#ggsave("f7.png", dpi=600)
The proportion of tone combinations for each tone.
# Word-level tone combinations grouped by the tones they contain.
#
# For each tone digit, collect every non-empty word-level tone string (from
# `df`) that contains that digit and label the rows with the digit itself.
# A string with several different tones therefore appears once under each of
# them. Labelling by the digit, rather than by a position in the
# frequency-sorted `df_tone1$s`, keeps the labels correct whatever the tone
# frequency ranking of the data is.
tone_digits <- c("4", "3", "2", "1", "0")

# Reuse the level order of the syllable-level tone factor so that the x axis
# of f8 keeps that ordering; union() guarantees every digit is a valid level.
tone_levels <- union(levels(df_tone1$s), tone_digits)

list_tone <- lapply(tone_digits, function(digit) {
  df %>%
    filter(tone != "", str_detect(tone, fixed(digit))) %>%
    mutate(t = factor(digit, levels = tone_levels))
})
tone_p <- bind_rows(list_tone)

# Frequency of each tone combination of at most two digits within each tone.
tone_p1 <- tone_p %>%
  filter(str_length(tone) <= 2) %>%
  count(t, tone) %>%
  rename(freq = n)

# Stacked bars: one bar per tone, one labelled segment per combination.
# (`position` is an argument of the geom, not of ggplot().)
f8 <- ggplot(data = tone_p1, mapping = aes(x = t, y = freq, fill = tone)) +
  geom_col(position = "stack", alpha = .6) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 24000)) +
  geom_text(aes(label = tone), size = 4, alpha = .7,
            position = position_stack(vjust = 0.5))
f8

#ggsave("f8.png", dpi=600, width = 7, height = 5)
To see the influence of POS

POS labels were created with the CKIP package on Google Colab.

pos <- read_csv("POS.csv")

# Attach the POS tag and the tone string to every word token. Both are paired
# with `word` by row position, so POS.csv must contain one row per row of
# `word`, in the same order.
word1 <- word %>%
  mutate(POS = pos$POS, tone = tone)

# Fine-grained tags grouped into coarse classes.
C <- c("Caa", "Cbb")
ADV <- c("Da", "Dfa", "Dfb", "D", "Dk")
POST <- c("Cab", "Cba", "Neqb", "Ng")
ASP <- c("Di")
N <- c("Na", "Nb", "Nc", "Nd", "Ncd", "Nh")
DET <- c("Neu", "Nes", "Nep", "Neqa")
M <- c("Nf")
T1 <- c("I", "T", "DE")
Vi <- c("VA", "VB", "VH", "VI")
Vt <- c("VAC", "VC", "VCL", "VD", "VE", "VF", "VG", "VHC", "VJ", "VK", "VL", "SHI", "V_2")

# Named lookup vector: fine-grained tag -> coarse class.
pos_groups <- list(
  C = C, ADV = ADV, POST = POST, ASP = ASP, N = N,
  DET = DET, M = M, T = T1, Vi = Vi, Vt = Vt
)
pos_lookup <- c(
  setNames(
    rep(names(pos_groups), lengths(pos_groups)),
    unlist(pos_groups, use.names = FALSE)
  ),
  # NOTE(review): punctuation categories are recoded as ASP / DET, exactly as
  # in the original mapping -- confirm that this is intended.
  COMMACATEGORY = "ASP",
  PERIODCATEGORY = "DET"
)

# Vectorised recode: tags without an entry in the lookup (and missing tags)
# are left unchanged instead of being visited one row at a time.
coarse_pos <- unname(pos_lookup[as.character(word1$POS)])
word1$POS <- coalesce(coarse_pos, as.character(word1$POS))

##tokens
# Percentage of all word tokens that belong to each POS class, pooled over
# speakers. Counting first and dividing by the total yields exactly one row
# per class, without a multi-row summarise() that has to be de-duplicated.
word_c_POS1 <- word1 %>%
  count(POS) %>%
  mutate(p = 100 * n / sum(n)) %>%
  select(-n) %>%
  mutate(type = "Tokens", speaker = "All speakers")

# Number of word tokens per speaker file.
id_len <- word1 %>%
  select(id) %>%
  group_by(id) %>%
  count() %>%
  rename(all = n)

# Number of word tokens per POS class and speaker.
word_c_POS3 <- word1 %>%
  select(word_c, id, POS) %>%
  mutate(type = "Tokens", speaker = "Average Speaker") %>%
  group_by(POS, id, type, speaker) %>%
  count()

POS_ave <- right_join(word_c_POS3, id_len, by = "id")

# Per-speaker percentage of each class, then the mean over speakers. Only
# speakers who produced at least one token of a class contribute to its mean.
POS_ave1 <- POS_ave %>%
  group_by(POS, id, type, speaker) %>%
  summarise(p = 100 * n / all, .groups = "drop") %>%
  group_by(type, POS, speaker) %>%
  summarise(p = mean(p), .groups = "drop")

##types
# One row per distinct (POS class, word) pair, i.e. per word type.
pos_type <- word1 %>%
  group_by(POS, word_c) %>%
  count() %>%
  ungroup()

# Percentage of all word types that belong to each POS class, pooled over
# speakers. Counting first and dividing by the total yields exactly one row
# per class, without a multi-row summarise() that has to be de-duplicated.
word_c_POS2 <- pos_type %>%
  select(POS) %>%
  count(POS) %>%
  mutate(p = 100 * n / sum(n), type = "Types", speaker = "All speakers") %>%
  select(POS, type, speaker, n, p)

# One row per distinct (word, speaker, POS class) combination.
word_c_t <- word1 %>%
  group_by(word_c, id, POS) %>%
  count() %>%
  ungroup()

# Number of word types per speaker file. The select() drops the token count
# `n` before counting rows.
id_len_t <- word_c_t %>%
  select(id) %>%
  group_by(id) %>%
  count() %>%
  rename(all = n)

# Number of word types per POS class and speaker.
word_pos_ave <- word_c_t %>%
  select(POS, word_c, id) %>%
  mutate(type = "Types", speaker = "Average Speaker") %>%
  group_by(POS, id, type, speaker) %>%
  count()

pos_ave <- right_join(word_pos_ave, id_len_t, by = "id")

# Per-speaker percentage of each class, then the mean over speakers. Only
# speakers who produced at least one type of a class contribute to its mean.
pos_ave1 <- pos_ave %>%
  group_by(POS, id, type, speaker) %>%
  summarise(p = 100 * n / all, .groups = "drop") %>%
  group_by(type, POS, speaker) %>%
  summarise(p = mean(p), .groups = "drop")

# Stack the four percentage tables (tokens/types x all/average speaker).
pos_token <- bind_rows(word_c_POS1, POS_ave1)
pos_type <- bind_rows(word_c_POS2, pos_ave1)

pos_data <- bind_rows(pos_token, pos_type)

# Turn POS into a factor and order its levels by percentage so that the
# stacked segments are sorted.
pos_data1 <- pos_data %>%
  ungroup() %>%
  mutate(POS = as.factor(POS)) %>%
  arrange(POS) %>%
  mutate(POS = fct_reorder(POS, p))

# Stacked percentage bars for tokens and types, one panel per speaker label;
# every segment is labelled with its POS class.
f12 <- ggplot(pos_data1, aes(x = type, y = p, fill = POS)) +
  geom_col(position = "stack", width = .5, alpha = .7) +
  geom_text(aes(label = POS), size = 3, alpha = .7, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 101)) +
  labs(x = "", y = "Percentage of words in corpus") +
  facet_grid(~speaker)

f12

The relationship between POS and tone distribution
# Frequency of every (tone combination, POS class) pair that occurs more than
# 10 times, most frequent first. Only tone strings present in `tone_p` are
# considered.
word_tone_POS <- word1 %>%
  select(tone, POS) %>%
  filter(tone %in% tone_p$tone) %>%
  count(tone, POS) %>%
  filter(n > 10) %>%
  arrange(desc(n))

# head() shows at most 50 rows and, unlike `[1:50, ]`, does not pad the table
# with NA rows when fewer than 50 pairs remain.
knitr::kable(head(word_tone_POS, 50))
tone POS n
0 T 4340
4 ADV 3696
3 N 3393
4 Vt 2867
4 DET 2247
3 Vt 1624
0 M 1348
1 N 1289
3 ADV 1285
4 P 1224
44 N 1068
1 Vt 934
24 N 896
24 ADV 815
2 ADV 814
44 ADV 744
1 ADV 717
4 Vi 637
42 N 550
30 N 533
34 ADV 501
2 N 492
14 C 490
3 Vi 475
1 DET 474
14 N 472
11 N 445
20 Vt 445
43 N 437
44 Vt 416
3 DET 412
4 N 399
32 N 397
33 C 397
41 N 395
44 Vi 391
23 N 384
14 Vt 365
2 DET 364
12 N 363
4 M 362
1 P 351
34 N 336
10 N 332
23 ADV 330
21 N 327
0 ASP 297
2 Vt 297
44 C 291
12 ADV 285
The specific words that contribute to the distribution of POS and tones
# Frequency of every (tone combination, POS class, word) triple, most
# frequent first.
pos_word <- word1 %>%
  select(tone, POS, word_c) %>%
  count(tone, POS, word_c) %>%
  arrange(desc(n))

# head() shows at most 50 rows and, unlike `[1:50, ]`, does not pad the table
# with NA rows when fewer than 50 triples exist.
knitr::kable(head(pos_word, 50))
tone POS word_c n
3 N 2707
0 T 2562
4 Vt 1647
4 DET 1508
0 M 1345
4 ADV 1214
3 Vt 833
1 N 745
4 P 596
3 ADV 594
4 DET 565
0 T 552
24 ADV 然後 539
1 ADV 517
30 N 我們 497
3 ADV 493
14 C 因為 485
3 N 476
4 ADV 450
20 Vt 覺得 445
4 ADV 444
24 N 時候 422
44 ADV 就是 421
4 ADV 407
1 DET 404
33 C 所以 396
0 T 384
1 Vt 369
4 ADV 326
1 P 320
2 N 305
2 DET 291
10 N 他們 266
0 ASP 262
20 DET 什麼 262
3 Vi 256
4 Vi 255
4 ADV 254
34 ADV 比較 252
2 ADV 249
4 P 248
44 N 現在 234
3 Vt 230
1 Vt 228
0 T 221
4 Vt 216
3 M 204
23 ADV 沒有 190
0 T 178
44 C 但是 174
Word length distribution

All speakers vs. the average speaker

# Length of every word token, measured in characters of its character form.
len <- str_length(word1$word_c)

# Number of word tokens per speaker file.
id_len <- word1 %>%
  select(id) %>%
  group_by(id) %>%
  count(name = "all")

# Corpus-wide totals used as denominators for the pooled proportions.
all_tokens <- nrow(word)
all_types <- nrow(word_c)

## word tokens
# Token count per word length and speaker.
word_len_ave <- word1 %>%
  select(word_c, id) %>%
  mutate(
    len = str_length(word_c),
    type = "Tokens",
    speaker = "Average Speaker"
  ) %>%
  group_by(len, id, type, speaker) %>%
  count()

ave <- word_len_ave %>%
  right_join(id_len, by = "id")

# Per-speaker proportion of each length, then the mean over speakers.
ave1 <- ave %>%
  group_by(len, id, type, speaker) %>%
  summarise(p = n / all) %>%
  group_by(type, len, speaker) %>%
  summarise(m = mean(p))

# Proportion of each length among all tokens, pooled over speakers.
word_len_all <- word1 %>%
  select(word_c) %>%
  mutate(
    len = str_length(word_c),
    type = "Tokens",
    speaker = "All Speakers"
  ) %>%
  group_by(type, len, speaker) %>%
  summarise(m = n() / all_tokens)

token_all_ave <- bind_rows(word_len_all, ave1)

##word types
# One row per distinct (word, speaker) combination.
word_c_t <- word %>%
  count(word_c, id)

len_t <- str_length(word_c_t$word_c)

# Number of word types per speaker file. The select() drops the token count
# `n` before counting rows.
id_len_t <- word_c_t %>%
  select(id) %>%
  group_by(id) %>%
  count(name = "all")

# Type count per word length and speaker.
word_type_ave <- word_c_t %>%
  select(word_c, id) %>%
  mutate(
    len = str_length(word_c),
    type = "Types",
    speaker = "Average Speaker"
  ) %>%
  group_by(len, id, type, speaker) %>%
  count()

type_ave <- word_type_ave %>%
  right_join(id_len_t, by = "id")

# Per-speaker proportion of each length, then the mean over speakers.
type_ave1 <- type_ave %>%
  group_by(len, id, type, speaker) %>%
  summarise(p = n / all) %>%
  group_by(type, len, speaker) %>%
  summarise(m = mean(p))

# Proportion of each length among all word types, pooled over speakers.
len_t_all <- str_length(word_c$word_c)
word_type_all <- word_c %>%
  select(word_c) %>%
  mutate(
    len = str_length(word_c),
    type = "Types",
    speaker = "All Speakers"
  ) %>%
  group_by(type, len, speaker) %>%
  summarise(m = n() / all_types)

type_all_ave <- bind_rows(type_ave1, word_type_all)

# Combine the type and token tables, convert the proportions to percentages
# and order the length factor by percentage.
len_data <- bind_rows(type_all_ave, token_all_ave)
len_data1 <- len_data %>%
  ungroup() %>%
  mutate(len = as.factor(len), m = 100 * m) %>%
  arrange(len) %>%
  mutate(len = fct_reorder(len, m))

# Stacked percentage bars for tokens and types, one panel per speaker label;
# segments are coloured by word length and labelled with the rounded
# percentage.
f9 <- ggplot(len_data1, aes(x = type, y = m, fill = len)) +
  geom_col(position = "stack", width = .5, alpha = .7) +
  geom_text(aes(label = round(m)), size = 4, alpha = .7, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 101)) +
  scale_fill_manual(
    name = "syll_len",
    values = c("royalblue", "skyblue", "blue", "darkblue", "navy", "black"),
    limits = c("1", "2", "3", "4", "5", "6"),
    breaks = c("1", "2", "3", "4", "5", "6"),
    labels = c("1", "2", "3", "4", "5", "6")
  ) +
  labs(x = "", y = "Percentage of words in corpus") +
  facet_grid(~speaker)

#ggsave("f9.png", dpi = 600)
f9

Word length distribution differences between ages and genders
##word tokens
# Length of every word token, measured in characters of its character form.
len <- str_length(word1$word_c)

# Token count per age group, gender and word length.
word_len_age <- word1 %>%
  select(word_c, age, gender) %>%
  mutate(len = str_length(word_c)) %>%
  group_by(age, gender, len) %>%
  count()

# Total number of tokens per age group and gender.
age_all <- word_len_age %>%
  group_by(age, gender) %>%
  summarise(all = sum(n))

word_len_age1 <- word_len_age %>%
  right_join(age_all, by = c("age", "gender"))

# Proportion of each word length within its age/gender cell.
word_len_age2 <- word_len_age1 %>%
  mutate(type = "Tokens") %>%
  group_by(age, gender, len, type) %>%
  summarise(m = n / all)

##word types
# One row per distinct (word, age group, gender) combination.
word_c_t_a <- word %>%
  count(word_c, age, gender)

len_t_a <- str_length(word_c_t_a$word_c)

# Number of word types per age group and gender. The select() drops the token
# count `n` before counting rows.
age_len_t <- word_c_t_a %>%
  select(age, gender) %>%
  group_by(age, gender) %>%
  count(name = "all")

# Type count per word length, age group and gender.
word_t_ave <- word_c_t_a %>%
  select(word_c, age, gender) %>%
  mutate(len = str_length(word_c), type = "Types") %>%
  group_by(len, age, gender, type) %>%
  count()

t_ave <- word_t_ave %>%
  right_join(age_len_t, by = c("age", "gender"))

# Proportion of each word length among the types of its age/gender cell.
t_ave1 <- t_ave %>%
  group_by(len, age, gender, type) %>%
  summarise(p = n / all) %>%
  group_by(age, gender, type, len) %>%
  summarise(m = mean(p))


# Combine the token and type tables, convert the proportions to percentages
# and order the length factor by percentage.
len_age <- bind_rows(word_len_age2, t_ave1)
len_age1 <- len_age %>%
  ungroup() %>%
  mutate(len = as.factor(len), m = 100 * m) %>%
  arrange(len) %>%
  mutate(len = fct_reorder(len, m))

# Stacked percentage bars for tokens and types in a gender-by-age grid of
# panels; segments are labelled with the rounded percentage.
f10 <- ggplot(len_age1, aes(x = type, y = m, fill = len)) +
  geom_col(position = "stack", width = .5, alpha = .7) +
  geom_text(aes(label = round(m)), size = 3, alpha = .7, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 101)) +
  labs(x = "", y = "") +
  facet_grid(gender ~ age)

#ggsave("f10.png", dpi= 600)
f10

Word length distribution differences between males and females
# Length of every word token, measured in characters of its character form.
len <- str_length(word1$word_c)

# Token count per gender and word length.
word_len_gender <- word1 %>%
  select(word_c, gender) %>%
  mutate(len = str_length(word_c)) %>%
  group_by(gender, len) %>%
  count()

# Total number of tokens per gender.
gender_all <- word_len_gender %>%
  group_by(gender) %>%
  summarise(all = sum(n))

word_len_gender1 <- word_len_gender %>%
  right_join(gender_all, by = "gender")

# Proportion of each word length within its gender.
word_len_gender2 <- word_len_gender1 %>%
  mutate(type = "Tokens") %>%
  group_by(gender, len, type) %>%
  summarise(m = n / all)

##word types
# One row per distinct (word, gender) combination.
word_c_t_g <- word %>%
  count(word_c, gender)

len_t_g <- str_length(word_c_t_g$word_c)

# Number of word types per gender. The select() drops the token count `n`
# before counting rows.
gender_len_t <- word_c_t_g %>%
  select(gender) %>%
  group_by(gender) %>%
  count(name = "all")

# Type count per word length and gender.
word_g_ave <- word_c_t_g %>%
  select(word_c, gender) %>%
  mutate(len = str_length(word_c), type = "Types") %>%
  group_by(len, gender, type) %>%
  count()

t_g <- word_g_ave %>%
  right_join(gender_len_t, by = "gender")

# Proportion of each word length among the types of its gender.
t_g1 <- t_g %>%
  group_by(len, gender, type) %>%
  summarise(p = n / all) %>%
  group_by(gender, type, len) %>%
  summarise(m = mean(p))


# Combine the token and type tables, convert the proportions to percentages
# and order the length factor by percentage.
len_gender <- bind_rows(word_len_gender2, t_g1)
len_gender1 <- len_gender %>%
  ungroup() %>%
  mutate(len = as.factor(len), m = 100 * m) %>%
  arrange(len) %>%
  mutate(len = fct_reorder(len, m))

# Stacked percentage bars for tokens and types, one panel per gender;
# segments are labelled with the rounded percentage.
f11 <- ggplot(len_gender1, aes(x = type, y = m, fill = len)) +
  geom_col(position = "stack", width = .5, alpha = .7) +
  geom_text(aes(label = round(m)), size = 4, alpha = .7, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 101)) +
  labs(x = "", y = "") +
  facet_grid(~ gender)

#ggsave("f11.png", dpi = 600)
f11